# from google.colab import drive
# drive.mount('/content/drive')
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Command to tell Python to actually display the graphs
%matplotlib inline
df = pd.read_csv('Automobile.csv')
# df = pd.read_csv('/location on your computer/Automobile.csv')
df.head()
| symboling | normalized_losses | make | fuel_type | aspiration | number_of_doors | body_style | drive_wheels | engine_location | wheel_base | ... | engine_size | fuel_system | bore | stroke | compression_ratio | horsepower | peak_rpm | city_mpg | highway_mpg | price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 168 | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 13495 |
| 1 | 3 | 168 | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 16500 |
| 2 | 1 | 168 | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | ... | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154 | 5000 | 19 | 26 | 16500 |
| 3 | 2 | 164 | audi | gas | std | four | sedan | fwd | front | 99.8 | ... | 109 | mpfi | 3.19 | 3.40 | 10.0 | 102 | 5500 | 24 | 30 | 13950 |
| 4 | 2 | 164 | audi | gas | std | four | sedan | 4wd | front | 99.4 | ... | 136 | mpfi | 3.19 | 3.40 | 8.0 | 115 | 5500 | 18 | 22 | 17450 |
5 rows × 26 columns
df.shape
(201, 26)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 201 entries, 0 to 200 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 symboling 201 non-null int64 1 normalized_losses 201 non-null int64 2 make 201 non-null object 3 fuel_type 201 non-null object 4 aspiration 201 non-null object 5 number_of_doors 201 non-null object 6 body_style 201 non-null object 7 drive_wheels 201 non-null object 8 engine_location 201 non-null object 9 wheel_base 201 non-null float64 10 length 201 non-null float64 11 width 201 non-null float64 12 height 201 non-null float64 13 curb_weight 201 non-null int64 14 engine_type 201 non-null object 15 number_of_cylinders 201 non-null object 16 engine_size 201 non-null int64 17 fuel_system 201 non-null object 18 bore 201 non-null float64 19 stroke 201 non-null float64 20 compression_ratio 201 non-null float64 21 horsepower 201 non-null int64 22 peak_rpm 201 non-null int64 23 city_mpg 201 non-null int64 24 highway_mpg 201 non-null int64 25 price 201 non-null int64 dtypes: float64(7), int64(9), object(10) memory usage: 41.0+ KB
df.describe(include='all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| symboling | 201.0 | NaN | NaN | NaN | 0.840796 | 1.254802 | -2.0 | 0.0 | 1.0 | 2.0 | 3.0 |
| normalized_losses | 201.0 | NaN | NaN | NaN | 125.189055 | 33.572966 | 65.0 | 101.0 | 122.0 | 150.0 | 256.0 |
| make | 201 | 22 | toyota | 32 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| fuel_type | 201 | 2 | gas | 181 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| aspiration | 201 | 2 | std | 165 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| number_of_doors | 201 | 2 | four | 114 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| body_style | 201 | 5 | sedan | 94 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| drive_wheels | 201 | 3 | fwd | 118 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| engine_location | 201 | 2 | front | 198 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| wheel_base | 201.0 | NaN | NaN | NaN | 98.797015 | 6.066366 | 86.6 | 94.5 | 97.0 | 102.4 | 120.9 |
| length | 201.0 | NaN | NaN | NaN | 174.200995 | 12.322175 | 141.1 | 166.8 | 173.2 | 183.5 | 208.1 |
| width | 201.0 | NaN | NaN | NaN | 65.889055 | 2.101471 | 60.3 | 64.1 | 65.5 | 66.6 | 72.0 |
| height | 201.0 | NaN | NaN | NaN | 53.766667 | 2.447822 | 47.8 | 52.0 | 54.1 | 55.5 | 59.8 |
| curb_weight | 201.0 | NaN | NaN | NaN | 2555.666667 | 517.296727 | 1488.0 | 2169.0 | 2414.0 | 2926.0 | 4066.0 |
| engine_type | 201 | 6 | ohc | 145 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| number_of_cylinders | 201 | 7 | four | 157 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| engine_size | 201.0 | NaN | NaN | NaN | 126.875622 | 41.546834 | 61.0 | 98.0 | 120.0 | 141.0 | 326.0 |
| fuel_system | 201 | 8 | mpfi | 92 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| bore | 201.0 | NaN | NaN | NaN | 3.329701 | 0.268166 | 2.54 | 3.15 | 3.31 | 3.58 | 3.94 |
| stroke | 201.0 | NaN | NaN | NaN | 3.261741 | 0.317875 | 2.07 | 3.11 | 3.29 | 3.46 | 4.17 |
| compression_ratio | 201.0 | NaN | NaN | NaN | 10.164279 | 4.004965 | 7.0 | 8.6 | 9.0 | 9.4 | 23.0 |
| horsepower | 201.0 | NaN | NaN | NaN | 103.263682 | 37.389372 | 48.0 | 70.0 | 95.0 | 116.0 | 262.0 |
| peak_rpm | 201.0 | NaN | NaN | NaN | 5121.393035 | 479.624905 | 4150.0 | 4800.0 | 5200.0 | 5500.0 | 6600.0 |
| city_mpg | 201.0 | NaN | NaN | NaN | 25.179104 | 6.42322 | 13.0 | 19.0 | 24.0 | 30.0 | 49.0 |
| highway_mpg | 201.0 | NaN | NaN | NaN | 30.686567 | 6.81515 | 16.0 | 25.0 | 30.0 | 34.0 | 54.0 |
| price | 201.0 | NaN | NaN | NaN | 13207.129353 | 7947.066342 | 5118.0 | 7775.0 | 10295.0 | 16500.0 | 45400.0 |
sns.histplot(data=df, x='price')
<AxesSubplot:xlabel='price', ylabel='Count'>
Let's see how we can customize a histogram.
# Configure the current axes through the object-oriented API, then draw on it.
ax = plt.gca()
ax.set_title('Histogram:Price')
ax.set_xlim(3000, 50000)
ax.set_ylim(0, 70)
ax.set_xlabel('Price of cars')
ax.set_ylabel('Frequency')
# The trailing ';' keeps the notebook from printing the returned Axes repr.
sns.histplot(data=df, x='price', color='orange', ax=ax);
We can specify the number of intervals (or groups or bins) to create by setting the bins parameter.
sns.histplot(data=df, x='price', bins=5)
<AxesSubplot:xlabel='price', ylabel='Count'>
sns.histplot(data=df, x='price', bins=20)
<AxesSubplot:xlabel='price', ylabel='Count'>
If we want to specify the width of the intervals (or groups or bins), we can use binwidth parameter.
sns.histplot(data=df, x='price', binwidth=20)
<AxesSubplot:xlabel='price', ylabel='Count'>
sns.histplot(data=df, x='price', binwidth=200)
<AxesSubplot:xlabel='price', ylabel='Count'>
How to find the optimal number of bins: Rule of thumb
We calculate the bin-width first, using the following formula: $$ binwidth = \frac{2 \times IQR}{\sqrt[3]{n}} $$ where IQR is the interquartile range of the variable and n is the number of rows in the dataset.
Then, we obtain bins using the calculated bin-width. $$ bins =\frac{Range}{binwidth} $$
In addition to the bars, we can also add a density estimate by setting the kde parameter to True.
sns.histplot(data=df, x='price', kde=True);
sns.histplot(data=df, x='price', bins=700, kde=True);
Clearly, if we increase the number of bins, it reduces the frequency count in each group (bin). Since the scale of KDE depends on the total frequency of each bin (group), the above code gives us a flattened KDE plot.
Let's check out the histograms for a few more attributes in the data.
sns.histplot(data=df, x='curb_weight', kde=True);
sns.histplot(data=df, x='horsepower', kde=True);
Histograms are intuitive, but they are hardly a good choice when we want to compare the distributions of several groups. For example,
sns.histplot(data=df, x='price', hue='body_style', kde=True);
It might be better to use subplots!
# One price histogram per body style, one facet per column.
# FacetGrid.map() returns the grid itself, so `g` still refers to the FacetGrid.
g = sns.FacetGrid(data=df, col="body_style").map(sns.histplot, "price");
In such cases, we can use boxplots. Boxplots, or box-and-whiskers plots, are an excellent way to visualize differences among groups.
from IPython.display import Image
# Image('/content/drive/MyDrive/Python Course/boxplot.png')
Image('boxplot.png')
# creating a boxplot with seaborn
sns.boxplot(data=df, x='curb_weight')
<AxesSubplot:xlabel='curb_weight'>
Let's see how we can customize a boxplot.
# sns.axes_style() only *returns* a dictionary of style parameters; called as a
# bare statement it changes nothing. Used as a context manager it applies the
# 'whitegrid' style to every axes created inside the block, so it has to wrap
# the first call that creates the axes (plt.title here).
with sns.axes_style('whitegrid'):
    plt.title('Boxplot:Horsepower')
    plt.xlim(30, 300)
    plt.xlabel('Horsepower')
    sns.boxplot(data=df, x='horsepower', color='green');
from IPython.display import Image
# Image('/content/drive/MyDrive/skew_box.png')
Image('skew_box.png')
For example,
sns.boxplot(data=df, x='price')
<AxesSubplot:xlabel='price'>
From the above plot, we can see that the distribution of price is positively skewed.
Let's see how we can compare groups with boxplots.
sns.boxplot(data=df, x='body_style', y='price') ;
Though boxplot visually summarizes variation in large datasets, it is unable to show multimodality and clusters.
sns.boxplot(data=df, x='bore')
<AxesSubplot:xlabel='bore'>
sns.histplot(data=df, x='bore',kde = True)
<AxesSubplot:xlabel='bore', ylabel='Count'>
sns.countplot(data=df, x='body_style')
<AxesSubplot:xlabel='body_style', ylabel='count'>
We can also make the plot more granular by specifying the hue parameter to display counts for subgroups.
sns.countplot(data=df, x='body_style', hue='fuel_type')
<AxesSubplot:xlabel='body_style', ylabel='count'>
Let's check out the bar graphs for a few more attributes in the data.
sns.countplot(data=df, x='make')
<AxesSubplot:xlabel='make', ylabel='count'>
plt.figure(figsize=(20,7))
sns.countplot(data=df, x='make')
<AxesSubplot:xlabel='make', ylabel='count'>
plt.figure(figsize=(20,7))
sns.countplot(data=df, x='make')
plt.xticks(rotation=90)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21]),
[Text(0, 0, 'alfa-romero'),
Text(1, 0, 'audi'),
Text(2, 0, 'bmw'),
Text(3, 0, 'chevrolet'),
Text(4, 0, 'dodge'),
Text(5, 0, 'honda'),
Text(6, 0, 'isuzu'),
Text(7, 0, 'jaguar'),
Text(8, 0, 'mazda'),
Text(9, 0, 'mercedes-benz'),
Text(10, 0, 'mercury'),
Text(11, 0, 'mitsubishi'),
Text(12, 0, 'nissan'),
Text(13, 0, 'peugot'),
Text(14, 0, 'plymouth'),
Text(15, 0, 'porsche'),
Text(16, 0, 'renault'),
Text(17, 0, 'saab'),
Text(18, 0, 'subaru'),
Text(19, 0, 'toyota'),
Text(20, 0, 'volkswagen'),
Text(21, 0, 'volvo')])
plt.figure(figsize=(20,7))
sns.countplot(data=df, x='make')
plt.xticks(rotation=90)
plt.show() # this will ensure that the plot is displayed without the text
Here are some common ways to customize a barplot.
# Create the figure, then style and draw on its axes via the axes object.
plt.figure(figsize=(10, 7))
ax = plt.gca()
ax.set_title('Barplot:Engine-type')
ax.set_ylim(0, 180)
# Count of cars per engine type, split into one bar per fuel type.
sns.countplot(data=df, x='engine_type', hue='fuel_type', ax=ax)
ax.set_xlabel('Engine-type')
Text(0.5, 0, 'Engine-type')
Suppose, your dataset has multiple y values for each x value. A lineplot is a great way to visualize this. This type of data often shows up when we have data that evolves over time, for example, when we have monthly data over several years. If we want to compare the individual months, then a line plot is a great option. This is sometimes called seasonality analysis.
from IPython.display import Image
# Image('/content/drive/MyDrive/Python Course/Line_plot.png')
Image('Line_plot.png')
A line plot uses straight lines to connect individual data points to display a trend or pattern in the data.
The lineplot() function of seaborn, by default, aggregates over multiple y values at each value of x and uses an estimate of the central tendency for the plot.
lineplot() assumes that you are most often trying to draw y as a function of x. So, by default, it sorts the data by the x values before plotting.
# loading one of the example datasets available in seaborn
flights = sns.load_dataset("flights")
# creating a line plot
sns.lineplot(data = flights , x = 'month' , y = 'passengers');
The light blue shaded area is actually the 'confidence interval' of the y-value estimates for each x-axis value.
The confidence interval is a range of values around that estimate that are believed to contain the true value of that estimate with a certain probability.
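We can switch off the confidence intervals. In older seaborn versions this is done by setting the ci parameter to False; from seaborn 0.12 onwards ci is deprecated (it still works but raises a FutureWarning) and the recommended way is errorbar=None.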
# errorbar=None hides the confidence band; it replaces the deprecated `ci=False`,
# which raises a FutureWarning in seaborn >= 0.12.
sns.lineplot(data=flights, x='month', y='passengers', errorbar=None);
/var/folders/vw/476pkr6x3s59w2nfh1xx99mw0000gn/T/ipykernel_16437/1033138464.py:1: FutureWarning:
The `ci` parameter is deprecated. Use `errorbar=('ci', False)` for the same effect.
sns.lineplot(data = flights , x = 'month' , y = 'passengers', ci = False);
We can also check the relationship between two variables for different categories by specifying the hue parameter.
# One line per year; errorbar=None hides the confidence band and replaces the
# deprecated `ci=False` (FutureWarning in seaborn >= 0.12).
sns.lineplot(data=flights, x='month', y='passengers', errorbar=None, hue='year');
/var/folders/vw/476pkr6x3s59w2nfh1xx99mw0000gn/T/ipykernel_16437/3583462331.py:1: FutureWarning:
The `ci` parameter is deprecated. Use `errorbar=('ci', False)` for the same effect.
sns.lineplot(data=flights,x = 'month' , y = 'passengers', ci = False ,hue='year');
We can change the style of the lines by adding 'style' parameter to the function.
# loading one of the example datasets available in seaborn
fmri = sns.load_dataset("fmri")
# creating the line plot: colour and line style both vary by region.
# errorbar=None hides the confidence band and replaces the deprecated
# `ci=False` (FutureWarning in seaborn >= 0.12).
sns.lineplot(data=fmri, x="timepoint", y="signal", hue="region", style="region", errorbar=None);
/var/folders/vw/476pkr6x3s59w2nfh1xx99mw0000gn/T/ipykernel_16437/11193929.py:5: FutureWarning:
The `ci` parameter is deprecated. Use `errorbar=('ci', False)` for the same effect.
sns.lineplot(data = fmri, x="timepoint", y="signal", hue="region", style="region", ci = False);
We can also add markers at each observation to identify groups in a better way.
# markers=True draws a marker at each x position, one marker shape per region.
# errorbar=None hides the confidence band and replaces the deprecated
# `ci=False` (FutureWarning in seaborn >= 0.12).
sns.lineplot(data=fmri, x="timepoint", y="signal", hue="region", style="region", errorbar=None, markers=True);
/var/folders/vw/476pkr6x3s59w2nfh1xx99mw0000gn/T/ipykernel_16437/1262815301.py:1: FutureWarning:
The `ci` parameter is deprecated. Use `errorbar=('ci', False)` for the same effect.
sns.lineplot(data = fmri, x="timepoint", y="signal", hue="region", style="region", ci = False, markers = True);
Let's customize the lineplot for a better visualization.
# Open a wide figure; lineplot draws on its current axes and returns that axes.
plt.figure(figsize=(15, 7))
ax = sns.lineplot(data=flights, x='month', y='passengers', hue='year')
ax.set_ylabel('Number of Passengers')
# bbox_to_anchor is another way to change the legend's location in the plot.
ax.legend(bbox_to_anchor=[1, 1]);
Sometimes we want to know if two variables mean something when put together, whether a small change in one variable affects the other variable. In such cases, plotting a scatterplot, or scatter-diagram, with our data points can help us to check whether there is a potential relationship between them.
sns.scatterplot(data=df, x='engine_size', y='horsepower')
<AxesSubplot:xlabel='engine_size', ylabel='horsepower'>
We can also check the relationship between two variables for different categories by specifying the hue parameter.
sns.scatterplot(data=df, x='engine_size', y='horsepower', hue='fuel_type')
<AxesSubplot:xlabel='engine_size', ylabel='horsepower'>
We can assign the same variable as hue to another parameter style which will vary the markers and create a more readable plot.
sns.scatterplot(data=df, x='engine_size', y='horsepower', hue='fuel_type', style='fuel_type')
<AxesSubplot:xlabel='engine_size', ylabel='horsepower'>
Correlation
Correlation means association. More precisely, it expresses the extent to which two variables change together at a constant rate.
Let's check out the relationship between a few more variables using scatter plots.
sns.scatterplot(data=df, x='curb_weight', y='engine_size')
<AxesSubplot:xlabel='curb_weight', ylabel='engine_size'>
From the above plot, we can say that these variables are positively correlated.
sns.scatterplot(data=df, x='bore', y='stroke')
<AxesSubplot:xlabel='bore', ylabel='stroke'>
Note:
We can see from the scatterplot of engine_size vs horsepower that there is a positive correlation between the two variables. Now, we want to measure the relationship between these two variables quantitatively and try to predict the 'horsepower' based on 'engine size'. This can be easily done by fitting a linear model. Here comes the seaborn *lmplot()* function to help us with that.
sns.lmplot(data=df, x='curb_weight', y='horsepower')
<seaborn.axisgrid.FacetGrid at 0x16c501a30>
We can also check the relationship between two variables for different categories by specifying the hue parameter.
# One regression fit per fuel type. `ci` is documented as an int in [0, 100]
# or None; None is the supported way to skip the confidence interval.
sns.lmplot(data=df, x='curb_weight', y='horsepower', hue='fuel_type', ci=None)
<seaborn.axisgrid.FacetGrid at 0x16a24ad60>
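We can also disable the 95% confidence interval and keep just the fitted regression line by setting the ci parameter to None, which is the value the seaborn documentation gives for skipping the interval (passing False hides the band as well).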
# ci=None (rather than False) is the documented way to drop the confidence band
# and keep only the fitted regression line.
sns.lmplot(data=df, x='curb_weight', y='horsepower', ci=None)
<seaborn.axisgrid.FacetGrid at 0x16a1fe820>
We can also plot the levels of the third variable across different plots.
sns.lmplot(data=df, x='curb_weight', y='horsepower', col='fuel_type')
<seaborn.axisgrid.FacetGrid at 0x16a056cd0>
Let's check out the relationship between a few more variables using lmplot().
# One facet per number of doors; ci=None is the documented way to skip the
# confidence interval.
sns.lmplot(data=df, x='curb_weight', y='engine_size', col='number_of_doors', ci=None)
<seaborn.axisgrid.FacetGrid at 0x16a451cd0>
# Linear fit of price on horsepower; ci=None is the documented way to skip the
# confidence interval.
sns.lmplot(data=df, x='horsepower', y='price', ci=None)
<seaborn.axisgrid.FacetGrid at 0x16a2bda90>
We have seen that histograms help us understand the distribution of individual variables and scatterplots help us identify the relationship between two variables. While we can view and analyze these plots separately, having them together in a single visualization would allow us to capture a lot more information in a concise manner. That's where a jointplot comes to our aid.
sns.jointplot(data=df, x='engine_size', y='horsepower')
<seaborn.axisgrid.JointGrid at 0x16c606310>
One of the drawbacks of scatterplots is the overlapping of points. When we have large volumes of data to plot, the data points in the plot overlap with each other, making it difficult to interpret the data.
In such cases, we can divide the entire plot into bins using different shapes (square, triangle, hexagon, etc.) and then try to see the number of data points falling within each of the bins.
seaborn's jointplot() provides a 'hex' kind to plot the data in the above-mentioned way.
We can create a hexbin plot by setting kind="hex".
sns.jointplot(data=df, x='engine_size', y='horsepower', kind="hex")
plt.colorbar(); # adds a separate axis indicating the color scale in this plot
We can also create a kde plot by setting kind="kde".
sns.jointplot(data=df, x='engine_size', y='horsepower', kind="kde", fill=True)
<seaborn.axisgrid.JointGrid at 0x16c85dd90>
Let's check out the joint plots for a few more attributes in the data.
sns.jointplot(data=df, x='price', y='city_mpg', kind="reg")
<seaborn.axisgrid.JointGrid at 0x169f92430>
sns.violinplot(data=df, x='horsepower')
<AxesSubplot:xlabel='horsepower'>
We can get a vertical plot by setting orient parameter to 'v' and assigning a numeric variable to the y-axis.
sns.violinplot(data=df, x='fuel_type', y='horsepower', orient='v')
<AxesSubplot:xlabel='fuel_type', ylabel='horsepower'>
Let's check out the violinplots for a few more attributes in the data.
sns.violinplot(data=df, x='engine_size', y='fuel_type')
<AxesSubplot:xlabel='engine_size', ylabel='fuel_type'>
We can use the palette parameter to change the colour palette.
sns.violinplot(data=df, x='body_style', y='engine_size', palette="bright")
<AxesSubplot:xlabel='body_style', ylabel='engine_size'>
sns.violinplot(data=df, x='engine_location', y='price', palette="colorblind")
<AxesSubplot:xlabel='engine_location', ylabel='price'>
Histograms and other distribution plots are typically preferred for larger data sets. When we have a small dataset and we want to visualize the frequency distribution, a strip plot can be used.
sns.stripplot(data=df, x='engine_size')
<AxesSubplot:xlabel='engine_size'>
Strip plots are considered a good alternative to a box plot or a violin plot for comparing data distributions when we have fewer data points.
plt.figure(figsize=(15,7))
sns.stripplot(data=df, x='body_style', y='engine_size')
<AxesSubplot:xlabel='body_style', ylabel='engine_size'>
Strip plots are more useful when we add random noise called "jitter" to avoid overlapping of data points with same values.
plt.figure(figsize=(20,7))
sns.stripplot(data=df, x='body_style', y='engine_size', jitter=True)
<AxesSubplot:xlabel='body_style', ylabel='engine_size'>
We can set the hue parameter to display observations for subgroups.
plt.figure(figsize=(10,7))
sns.stripplot(data=df, x='fuel_type', y='engine_size', hue="number_of_doors", jitter=True)
<AxesSubplot:xlabel='fuel_type', ylabel='engine_size'>
Let's check out the strip plots for a few more attributes in the data.
plt.figure(figsize=(10,7))
sns.stripplot(data=df, x='number_of_doors', y='price',jitter=True)
<AxesSubplot:xlabel='number_of_doors', ylabel='price'>
plt.figure(figsize=(10,7))
sns.stripplot(data=df, x='number_of_doors', y='horsepower', jitter=True)
<AxesSubplot:xlabel='number_of_doors', ylabel='horsepower'>
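In a strip plot, the jitter spreads the dots randomly within each category, so the width of a strip tells us nothing about the distribution. A swarm plot solves this: it positions the points so that they do not overlap, and the width of the swarm then reflects how many observations have similar values.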
sns.swarmplot(data=df, x='number_of_doors', y='price')
<AxesSubplot:xlabel='number_of_doors', ylabel='price'>
We can add the hue parameter to swarmplot() and separate the categories.
sns.swarmplot(data=df, x='fuel_type', y='price', hue='number_of_doors')
<AxesSubplot:xlabel='fuel_type', ylabel='price'>
We can separate the levels of the hue variable (here, two-door and four-door cars) into side-by-side swarms by setting the dodge argument to True.
sns.swarmplot(data=df, x='fuel_type', y='price', hue='number_of_doors', dodge=True)
<AxesSubplot:xlabel='fuel_type', ylabel='price'>
sns.catplot(data=df, x='fuel_type', y='horsepower')
<seaborn.axisgrid.FacetGrid at 0x16cc7b880>
We can set kind='point' to create a pointplot using the catplot() function.
sns.catplot(data=df, x='body_style', y='horsepower', hue='fuel_type', kind='point')
<seaborn.axisgrid.FacetGrid at 0x16ca33a60>
Catplot is very useful to draw categorical plots onto a FacetGrid by assigning a third variable to the col parameter.
sns.catplot(data=df, x="fuel_type", y="horsepower", hue="number_of_doors", col="drive_wheels", kind='bar', palette='pastel')
<seaborn.axisgrid.FacetGrid at 0x16cd74f40>
Let's check out the catplot() function for a few more attributes in the data.
sns.catplot(data=df, x="fuel_type", y="engine_size", hue="body_style", col="number_of_doors", kind='box', palette='bright')
<seaborn.axisgrid.FacetGrid at 0x16cb92dc0>
sns.catplot(data=df, x="number_of_doors", y="price", hue="body_style", col="fuel_type", kind='swarm', palette='muted')
<seaborn.axisgrid.FacetGrid at 0x16cd7dd30>
sns.pairplot(data=df[['normalized_losses','wheel_base','curb_weight','engine_size','price','peak_rpm']])
<seaborn.axisgrid.PairGrid at 0x16cfc30d0>
We can add the hue parameter in pairplot to create a semantic mapping.
Also, we can add vars parameter to assign a list of variables from the dataset for which we want to create the pairplot.
sns.pairplot(data=df, vars=['wheel_base', 'curb_weight', 'engine_size', 'price'], hue='number_of_doors')
<seaborn.axisgrid.PairGrid at 0x16d143700>
We can set corner=True to plot only the lower triangle of a pairplot.
sns.pairplot(data=df, vars=['wheel_base', 'curb_weight', 'engine_size', 'price'], corner=True)
<seaborn.axisgrid.PairGrid at 0x16d143970>
sns.heatmap(data=df[['wheel_base','curb_weight','engine_size','price']].corr())
<AxesSubplot:>
We can set the annot parameter to True for displaying the numeric value in each cell.
sns.heatmap(data=df[['wheel_base','curb_weight','engine_size','price']].corr(), annot=True, cbar=False)
<Axes: >
We can apply a different colormap with the cmap parameter for better visual appeal.
sns.heatmap(data=df[['wheel_base','curb_weight','engine_size','price']].corr(), annot=True, cmap='YlGnBu')
<Axes: >
# let's start by installing plotly
#!pip install plotly
# !pip show plotly
Name: plotly Version: 5.16.1 Summary: An open-source, interactive data visualization library for Python Home-page: https://plotly.com/python/ Author: Chris P Author-email: chris@plot.ly License: MIT Location: /opt/homebrew/lib/python3.11/site-packages Requires: packaging, tenacity Required-by:
# importing plotly
import plotly.express as px
his = px.histogram(df, x="price")
his.show()
bar = px.bar(df, x='peak_rpm', y='horsepower')
bar.show()
scat = px.scatter(df, x='price', y='engine_size')
scat.show()
fig = px.box(df, x="fuel_type", y="horsepower", points="all")
fig.show()
fig_3d = px.scatter_3d(df, x='fuel_type', y='horsepower', z='price', color='horsepower')
fig_3d.show()
# to save the output to an HTML file
fig_3d.write_html("scatter_3d.html")
To change the axis scales of a plot:
plt.xlim(left, right) # here left and right indicates the min and the max limits of x-axis respectively
plt.ylim(bottom, top) # here bottom and top indicates the min and the max limits of y-axis respectively
To change the plot title and axis labels:
plt.title()
plt.xlabel() # for x_axis labels
plt.ylabel() # for y_axis labels
To show the grid lines of a plot:
sns.set(style="darkgrid")
sns.set(style="whitegrid")
sns.set(style="white")
To hide the axes of a plot:
plt.axis('off') # it will hide both the x-axis and y-axis
To customize labels of the plot legend:
ax.legend([]) # takes list of the legend values
To change the position of the plot legend:
plt.legend(loc=___)
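The following values can be passed to loc to change the location of the legend: 'best', 'upper right', 'upper left', 'lower left', 'lower right', 'right', 'center left', 'center right', 'lower center', 'upper center' and 'center'.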
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Command to tell Python to actually display the graphs
%matplotlib inline
Load The BookList Dataset
df = pd.read_csv('BookList.csv')
df.head()
# df.tail()
| Rating | Reviews | Book_title | Description | Number_Of_Pages | Type | Price | |
|---|---|---|---|---|---|---|---|
| 0 | 4.17 | 3829 | The Elements of Style | This style manual offers practical advice on i... | 105 | Hardcover | 9.323529 |
| 1 | 4.01 | 1406 | The Information: A History, a Theory, a Flood | James Gleick, the author of the best sellers C... | 527 | Hardcover | 11.000000 |
| 2 | 3.33 | 0 | Responsive Web Design Overview For Beginners | In Responsive Web Design Overview For Beginner... | 50 | Kindle Edition | 11.267647 |
| 3 | 3.97 | 1658 | Ghost in the Wires: My Adventures as the World... | If they were a hall of fame or shame for compu... | 393 | Hardcover | 12.873529 |
| 4 | 4.06 | 1325 | How Google Works | Both Eric Schmidt and Jonathan Rosenberg came ... | 305 | Kindle Edition | 13.164706 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 271 entries, 0 to 270 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Rating 271 non-null float64 1 Reviews 271 non-null int64 2 Book_title 271 non-null object 3 Description 271 non-null object 4 Number_Of_Pages 271 non-null int64 5 Type 271 non-null object 6 Price 271 non-null float64 dtypes: float64(2), int64(2), object(3) memory usage: 14.9+ KB
# Checking missing values
df.isna().sum()
Rating 0 Reviews 0 Book_title 0 Description 0 Number_Of_Pages 0 Type 0 Price 0 dtype: int64
sns.histplot(data=df, x='Price')
<Axes: xlabel='Price', ylabel='Count'>
df['Type']
0 Hardcover
1 Hardcover
2 Kindle Edition
3 Hardcover
4 Kindle Edition
...
266 Hardcover
267 Paperback
268 Boxed Set - Hardcover
269 Hardcover
270 Hardcover
Name: Type, Length: 271, dtype: object
plt.figure(figsize=(15,7))
sns.countplot(data=df, x='Type')
# plt.xticks(rotation=90)
<Axes: xlabel='Type', ylabel='count'>
plt.figure(figsize=(15,7))
sns.stripplot(data=df, x='Type', y='Rating', hue= 'Type')
<Axes: xlabel='Type', ylabel='Rating'>
# sns.boxplot(data=df, x='Rating')
# Pass the variable as `y` to get a vertical box. With only `x` given,
# orient='v' is ignored and seaborn emits a UserWarning.
sns.boxplot(data=df, y='Rating', color='w', linewidth=1)
/opt/homebrew/lib/python3.11/site-packages/seaborn/_oldcore.py:1599: UserWarning: Vertical orientation ignored with only `x` specified.
<Axes: xlabel='Rating'>
sns.histplot(data=df, x='Reviews', kde=True)
<Axes: xlabel='Reviews', ylabel='Count'>
# The BookList section re-imports numpy/pandas/matplotlib/seaborn but not
# plotly, so `px` is undefined when this section runs on its own; import it here.
import plotly.express as px

# Price against number of pages
scat = px.scatter(df, x='Number_Of_Pages', y='Price')
scat.show()
# Rating against price
scat = px.scatter(df, x='Price', y='Rating')
scat.show()
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 271 entries, 0 to 270 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Rating 271 non-null float64 1 Reviews 271 non-null int64 2 Book_title 271 non-null object 3 Description 271 non-null object 4 Number_Of_Pages 271 non-null int64 5 Type 271 non-null object 6 Price 271 non-null float64 dtypes: float64(2), int64(2), object(3) memory usage: 14.9+ KB
sns.heatmap(data=df[['Rating','Reviews','Number_Of_Pages','Price']].corr(), annot=True, cbar=True)
<Axes: >